# Always print this out before your assignment
# Prints the R version, platform, and attached/loaded packages for this session
sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods
## [7] base
##
## other attached packages:
## [1] knitr_1.36
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.28 R6_2.5.1 jsonlite_1.7.2 magrittr_2.0.1
## [5] evaluate_0.14 rlang_0.4.11 stringi_1.7.5 jquerylib_0.1.4
## [9] bslib_0.3.1 rmarkdown_2.10 tools_4.1.1 stringr_1.4.0
## [13] xfun_0.25 yaml_2.2.1 fastmap_1.1.0 compiler_4.1.1
## [17] htmltools_0.5.2 sass_0.4.0
# Print the current working directory
getwd()
## [1] "/Users/angpham/Desktop/CPSC_Courses/MGSC310/final_project"
# Load all your libraries in this chunk
library('tidyverse')
library('dplyr')
library('ggplot2')
library('ggridges')
# NOTE: Do not run install.packages() inside a code chunk. Install them in the console outside of a code chunk.
# Read the Spotify tracks CSV from the project's spotify_dataset folder
tracks <- here::here("spotify_dataset", "spotify_tracks.csv") %>%
  read.csv()
The following link details each of the features in the ‘tracks’ dataset: https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-tracks
# Preview every column of the raw data: name, type, and first few values
glimpse(tracks)
## Rows: 101,939
## Columns: 32
## $ X <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, …
## $ acousticness <dbl> 0.294, 0.863, 0.750, 0.763, 0.770, 0.971, …
## $ album_id <chr> "0D3QufeCudpQANOR7luqdr", "1bcqsH5UyTBzmh9…
## $ analysis_url <chr> "https://api.spotify.com/v1/audio-analysis…
## $ artists_id <chr> "['3mxJuHRn2ZWD5OofvJtDZY']", "['4xWMewm6C…
## $ available_markets <chr> "['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG'…
## $ country <chr> "BE", "BE", "BE", "BE", "BE", "BE", "BE", …
## $ danceability <dbl> 0.698, 0.719, 0.466, 0.719, 0.460, 0.367, …
## $ disc_number <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ duration_ms <dbl> 235584, 656960, 492840, 316578, 558880, 18…
## $ energy <dbl> 0.6060, 0.3080, 0.9310, 0.1260, 0.9420, 0.…
## $ href <chr> "https://api.spotify.com/v1/tracks/5qljLQu…
## $ id <chr> "5qljLQuKnNJf4F4vfxQB0V", "3VAX2MJdmdqARLS…
## $ instrumentalness <dbl> 0.00000269, 0.00000000, 0.00000000, 0.0000…
## $ key <dbl> 10, 6, 4, 3, 7, 11, 10, 3, 5, 5, 6, 0, 4, …
## $ liveness <dbl> 0.1510, 0.2530, 0.9380, 0.1130, 0.9170, 0.…
## $ loudness <dbl> -7.447, -10.340, -13.605, -20.254, -13.749…
## $ lyrics <chr> "\n\nPerhaps I am bound to be restless\nAl…
## $ mode <dbl> 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, …
## $ name <chr> "Blood", "The Ugly Duckling", "Jimmy Launc…
## $ playlist <chr> "Hipsteribrunssi", "Animal Stories", "Best…
## $ popularity <dbl> 28, 31, 31, 14, 32, 45, 0, 26, 17, 29, 47,…
## $ preview_url <chr> "https://p.scdn.co/mp3-preview/1b05a902da3…
## $ speechiness <dbl> 0.0262, 0.9220, 0.9440, 0.9380, 0.9430, 0.…
## $ tempo <dbl> 115.018, 115.075, 79.565, 112.822, 81.260,…
## $ time_signature <dbl> 4, 3, 4, 3, 4, 4, 3, 4, 4, 4, 4, 4, 3, 4, …
## $ track_href <chr> "https://api.spotify.com/v1/tracks/5qljLQu…
## $ track_name_prev <chr> "track_14", "track_3", "track_4", "track_9…
## $ track_number <dbl> 1, 3, 4, 1, 2, 8, 2, 11, 6, 6, 1, 3, 12, 5…
## $ uri <chr> "spotify:track:5qljLQuKnNJf4F4vfxQB0V", "s…
## $ valence <dbl> 0.6220, 0.5890, 0.0850, 0.5330, 0.0906, 0.…
## $ type <chr> "track", "track", "track", "track", "track…
Below is a summary of the relevant features that will be used for this question. This includes general information about a track, its popularity, and its audio features:
- id: ID of the track, as identified by Spotify
- name: Name of track
- href: Link to Spotify API for complete information about a track
- uri: Unique identifier to access track on Spotify
- artists_id: List of artists IDs for a track
- album_id: Album ID of the album the track is a part of
- duration_ms: Length of track
- popularity: Popularity score for a track based on Spotify algorithms (Spotify indicates that it is largely based on number of plays.)
- acousticness: Confidence measure from 0.0 to 1.0 of whether the track is acoustic
- danceability: How suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity
- energy: Measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity
- instrumentalness: Predicts whether a track contains no vocals
- key: Key the track is in. Integers map to pitches using standard Pitch Class notation
- liveness: Detects the presence of an audience in the recording
- loudness: Quality of a sound that is the primary psychological correlate of physical strength (amplitude), measured in decibels (dB)
- mode: Modality (major or minor) of a track
- speechiness: Presence of spoken words in a track
- tempo: Speed or pace of a given piece in beats per minute (BPM)
- time_signature: Notational convention to specify how many beats are in each bar (or measure)
- valence: Musical positiveness conveyed by a track
# Omit any tracks with an NA value in any column
tracks_clean <- na.omit(tracks)
# Keep only the identifier, popularity, and audio-feature columns used below.
# The subset is taken from `tracks_clean` (not the raw `tracks`) so the NA
# filtering above is not discarded.
tracks_clean <- tracks_clean[, c(
  "id", "name", "href", "uri", "artists_id", "album_id",
  "duration_ms", "popularity",
  "acousticness", "danceability", "energy", "instrumentalness",
  "key", "liveness", "loudness", "mode", "speechiness",
  "tempo", "time_signature", "valence"
)]
tracks_clean %>% glimpse()
## Rows: 101,939
## Columns: 20
## $ id <chr> "5qljLQuKnNJf4F4vfxQB0V", "3VAX2MJdmdqARLSU…
## $ name <chr> "Blood", "The Ugly Duckling", "Jimmy Launch…
## $ href <chr> "https://api.spotify.com/v1/tracks/5qljLQuK…
## $ uri <chr> "spotify:track:5qljLQuKnNJf4F4vfxQB0V", "sp…
## $ artists_id <chr> "['3mxJuHRn2ZWD5OofvJtDZY']", "['4xWMewm6CY…
## $ album_id <chr> "0D3QufeCudpQANOR7luqdr", "1bcqsH5UyTBzmh9Y…
## $ duration_ms <dbl> 235584, 656960, 492840, 316578, 558880, 183…
## $ popularity <dbl> 28, 31, 31, 14, 32, 45, 0, 26, 17, 29, 47, …
## $ acousticness <dbl> 0.294, 0.863, 0.750, 0.763, 0.770, 0.971, 0…
## $ danceability <dbl> 0.698, 0.719, 0.466, 0.719, 0.460, 0.367, 0…
## $ energy <dbl> 0.6060, 0.3080, 0.9310, 0.1260, 0.9420, 0.3…
## $ instrumentalness <dbl> 0.00000269, 0.00000000, 0.00000000, 0.00000…
## $ key <dbl> 10, 6, 4, 3, 7, 11, 10, 3, 5, 5, 6, 0, 4, 5…
## $ liveness <dbl> 0.1510, 0.2530, 0.9380, 0.1130, 0.9170, 0.6…
## $ loudness <dbl> -7.447, -10.340, -13.605, -20.254, -13.749,…
## $ mode <dbl> 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1…
## $ speechiness <dbl> 0.0262, 0.9220, 0.9440, 0.9380, 0.9430, 0.0…
## $ tempo <dbl> 115.018, 115.075, 79.565, 112.822, 81.260, …
## $ time_signature <dbl> 4, 3, 4, 3, 4, 4, 3, 4, 4, 4, 4, 4, 3, 4, 4…
## $ valence <dbl> 0.6220, 0.5890, 0.0850, 0.5330, 0.0906, 0.1…
# Per-column summary of the cleaned tracks (quartiles and mean for numeric columns)
summary(tracks_clean)
## id name href
## Length:101939 Length:101939 Length:101939
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## uri artists_id album_id
## Length:101939 Length:101939 Length:101939
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## duration_ms popularity acousticness danceability
## Min. : 1155 Min. : 0.00 Min. :0.0000 Min. :0.000
## 1st Qu.: 184000 1st Qu.:29.00 1st Qu.:0.0407 1st Qu.:0.480
## Median : 216893 Median :41.00 Median :0.2380 Median :0.610
## Mean : 246771 Mean :39.78 Mean :0.3521 Mean :0.586
## 3rd Qu.: 261055 3rd Qu.:52.00 3rd Qu.:0.6450 3rd Qu.:0.714
## Max. :5505831 Max. :97.00 Max. :0.9960 Max. :0.989
## energy instrumentalness key
## Min. :0.0000 Min. :0.0000000 Min. : 0.000
## 1st Qu.:0.4110 1st Qu.:0.0000000 1st Qu.: 2.000
## Median :0.6290 Median :0.0000375 Median : 5.000
## Mean :0.5865 Mean :0.1487759 Mean : 5.271
## 3rd Qu.:0.7980 3rd Qu.:0.0344000 3rd Qu.: 8.000
## Max. :1.0000 Max. :1.0000000 Max. :11.000
## liveness loudness mode speechiness
## Min. :0.0000 Min. :-60.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0956 1st Qu.:-11.149 1st Qu.:0.0000 1st Qu.:0.0364
## Median :0.1240 Median : -7.599 Median :1.0000 Median :0.0506
## Mean :0.1976 Mean : -9.463 Mean :0.6182 Mean :0.1288
## 3rd Qu.:0.2410 3rd Qu.: -5.509 3rd Qu.:1.0000 3rd Qu.:0.1040
## Max. :0.9990 Max. : 2.719 Max. :1.0000 Max. :0.9690
## tempo time_signature valence
## Min. : 0.00 Min. :0.000 Min. :0.0000
## 1st Qu.: 95.97 1st Qu.:4.000 1st Qu.:0.2710
## Median :118.07 Median :4.000 Median :0.4770
## Mean :118.36 Mean :3.876 Mean :0.4828
## 3rd Qu.:136.04 3rd Qu.:4.000 3rd Qu.:0.6930
## Max. :244.03 Max. :5.000 Max. :0.9930
Below is a histogram showing the spread of the ‘popularity’ variable. From the plot below, we can see that the ‘popularity’ variable is roughly normally distributed on a scale from 0 to 100, with 100 indicating most popular. The ‘popularity’ variable is not ordered by rank. Spotify indicates it is largely based on number of plays, but includes other factors as well.
# Histogram of popularity scores in bins of width 5.
# The column is referenced by bare name inside aes() so ggplot2 looks it up in
# the `data` argument; using `tracks_clean$popularity` bypasses that lookup and
# labels the axis "tracks_clean$popularity".
ggplot(tracks_clean, aes(x = popularity)) +
  geom_histogram(binwidth = 5) +
  theme_minimal()
The following plots show relationships between the popularity scores and each audio feature. The plots are not linear; however, some plots, like danceability and energy clearly show that higher values in those audio features indicate more popular music, as indicated by darker points in the top right hand corner.
# Scatter plots of popularity against each audio feature. Points are drawn
# nearly transparent (alpha = 0.05) so dense regions appear darker, and
# geom_smooth() overlays a fitted trend using its default settings.

# acousticness
ggplot(data = tracks_clean, mapping = aes(x = acousticness, y = popularity)) +
  geom_point(alpha = 0.05) +
  geom_smooth() +
  theme_minimal()

# danceability
ggplot(data = tracks_clean, mapping = aes(x = danceability, y = popularity)) +
  geom_point(alpha = 0.05) +
  geom_smooth() +
  theme_minimal()

# energy
ggplot(data = tracks_clean, mapping = aes(x = energy, y = popularity)) +
  geom_point(alpha = 0.05) +
  geom_smooth() +
  theme_minimal()

# instrumentalness
ggplot(data = tracks_clean, mapping = aes(x = instrumentalness, y = popularity)) +
  geom_point(alpha = 0.05) +
  geom_smooth() +
  theme_minimal()

# key
ggplot(data = tracks_clean, mapping = aes(x = key, y = popularity)) +
  geom_point(alpha = 0.05) +
  geom_smooth() +
  theme_minimal()

# liveness
ggplot(data = tracks_clean, mapping = aes(x = liveness, y = popularity)) +
  geom_point(alpha = 0.05) +
  geom_smooth() +
  theme_minimal()

# loudness
ggplot(data = tracks_clean, mapping = aes(x = loudness, y = popularity)) +
  geom_point(alpha = 0.05) +
  geom_smooth() +
  theme_minimal()

# mode
ggplot(data = tracks_clean, mapping = aes(x = mode, y = popularity)) +
  geom_point(alpha = 0.05) +
  geom_smooth() +
  theme_minimal()

# speechiness
ggplot(data = tracks_clean, mapping = aes(x = speechiness, y = popularity)) +
  geom_point(alpha = 0.05) +
  geom_smooth() +
  theme_minimal()

# tempo
ggplot(data = tracks_clean, mapping = aes(x = tempo, y = popularity)) +
  geom_point(alpha = 0.05) +
  geom_smooth() +
  theme_minimal()

# time_signature
ggplot(data = tracks_clean, mapping = aes(x = time_signature, y = popularity)) +
  geom_point(alpha = 0.05) +
  geom_smooth() +
  theme_minimal()

# valence
ggplot(data = tracks_clean, mapping = aes(x = valence, y = popularity)) +
  geom_point(alpha = 0.05) +
  geom_smooth() +
  theme_minimal()
The boxplot below shows a clearer picture of the distribution of the ‘popularity’ variable. Our group has decided that any scores above the 75th percentile will be considered a hit. The 75th percentile is at the popularity score of 52.00, as indicated by the next code chunk.
# Horizontal boxplot of the popularity scores
ggplot(data = tracks_clean, mapping = aes(x = popularity)) +
  geom_boxplot() +
  theme_minimal()
# Popularity cutoff for a "hit": the 75th percentile of the popularity scores.
# `probs` is spelled out in full instead of relying on partial matching of `prob`.
percentile_75 <- quantile(tracks_clean$popularity, probs = 0.75)
# %g prints a whole-number quantile without decimals (52 -> "52") and also
# accepts a fractional quantile, which %i rejects with an "invalid format" error.
sprintf("75th Percentile = %g", as.numeric(percentile_75))
## [1] "75th Percentile = 52"
# Add a binary 'hit' factor (levels "0" and "1") to the cleaned tracks:
# 1 when popularity is strictly above the 75th-percentile cutoff, otherwise 0
tracks_hit <- tracks_clean %>%
  mutate(
    hit = factor(
      ifelse(popularity > percentile_75, 1, 0),
      levels = c("0", "1")
    )
  )
glimpse(tracks_hit)
## Rows: 101,939
## Columns: 21
## $ id <chr> "5qljLQuKnNJf4F4vfxQB0V", "3VAX2MJdmdqARLSU…
## $ name <chr> "Blood", "The Ugly Duckling", "Jimmy Launch…
## $ href <chr> "https://api.spotify.com/v1/tracks/5qljLQuK…
## $ uri <chr> "spotify:track:5qljLQuKnNJf4F4vfxQB0V", "sp…
## $ artists_id <chr> "['3mxJuHRn2ZWD5OofvJtDZY']", "['4xWMewm6CY…
## $ album_id <chr> "0D3QufeCudpQANOR7luqdr", "1bcqsH5UyTBzmh9Y…
## $ duration_ms <dbl> 235584, 656960, 492840, 316578, 558880, 183…
## $ popularity <dbl> 28, 31, 31, 14, 32, 45, 0, 26, 17, 29, 47, …
## $ acousticness <dbl> 0.294, 0.863, 0.750, 0.763, 0.770, 0.971, 0…
## $ danceability <dbl> 0.698, 0.719, 0.466, 0.719, 0.460, 0.367, 0…
## $ energy <dbl> 0.6060, 0.3080, 0.9310, 0.1260, 0.9420, 0.3…
## $ instrumentalness <dbl> 0.00000269, 0.00000000, 0.00000000, 0.00000…
## $ key <dbl> 10, 6, 4, 3, 7, 11, 10, 3, 5, 5, 6, 0, 4, 5…
## $ liveness <dbl> 0.1510, 0.2530, 0.9380, 0.1130, 0.9170, 0.6…
## $ loudness <dbl> -7.447, -10.340, -13.605, -20.254, -13.749,…
## $ mode <dbl> 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1…
## $ speechiness <dbl> 0.0262, 0.9220, 0.9440, 0.9380, 0.9430, 0.0…
## $ tempo <dbl> 115.018, 115.075, 79.565, 112.822, 81.260, …
## $ time_signature <dbl> 4, 3, 4, 3, 4, 4, 3, 4, 4, 4, 4, 4, 3, 4, 4…
## $ valence <dbl> 0.6220, 0.5890, 0.0850, 0.5330, 0.0906, 0.1…
## $ hit <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
The following plots compare the distribution of each audio feature between hits and non-hits. Most of the boxplots for each audio feature look the same whether they are a hit or not, indicating they may not be the strongest audio features in determining popularity. However, the difference seen in the ‘danceability’ and ‘energy’ box plots may indicate that they may be strong features in determining popularity.
# Boxplots of each audio feature, split by hit status (0 = not a hit, 1 = hit).
# Fill is mapped to the discrete `hit` factor so each box is coloured by group.
# A continuous fill such as `popularity` varies within a box, so the boxplot
# stat cannot keep it and it has no effect on the plot.

# acousticness
ggplot(tracks_hit, aes(x = hit, y = acousticness, fill = hit)) +
  geom_boxplot() +
  theme_minimal()

# danceability
ggplot(tracks_hit, aes(x = hit, y = danceability, fill = hit)) +
  geom_boxplot() +
  theme_minimal()

# energy
ggplot(tracks_hit, aes(x = hit, y = energy, fill = hit)) +
  geom_boxplot() +
  theme_minimal()

# instrumentalness
ggplot(tracks_hit, aes(x = hit, y = instrumentalness, fill = hit)) +
  geom_boxplot() +
  theme_minimal()

# key
ggplot(tracks_hit, aes(x = hit, y = key, fill = hit)) +
  geom_boxplot() +
  theme_minimal()

# liveness
ggplot(tracks_hit, aes(x = hit, y = liveness, fill = hit)) +
  geom_boxplot() +
  theme_minimal()

# loudness
ggplot(tracks_hit, aes(x = hit, y = loudness, fill = hit)) +
  geom_boxplot() +
  theme_minimal()

# mode
ggplot(tracks_hit, aes(x = hit, y = mode, fill = hit)) +
  geom_boxplot() +
  theme_minimal()

# speechiness
ggplot(tracks_hit, aes(x = hit, y = speechiness, fill = hit)) +
  geom_boxplot() +
  theme_minimal()

# tempo
ggplot(tracks_hit, aes(x = hit, y = tempo, fill = hit)) +
  geom_boxplot() +
  theme_minimal()

# time_signature
ggplot(tracks_hit, aes(x = hit, y = time_signature, fill = hit)) +
  geom_boxplot() +
  theme_minimal()

# valence
ggplot(tracks_hit, aes(x = hit, y = valence, fill = hit)) +
  geom_boxplot() +
  theme_minimal()
# Read the per-artist genre and audio-feature table
artist_genres <- here::here("spotify_dataset", "artist_genres.csv") %>%
  read.csv()
# Drop any artist row that has an NA value in any column
artist_genres_clean <- artist_genres %>%
  na.omit()
glimpse(artist_genres_clean)
## Rows: 34,680
## Columns: 11
## $ Artist <chr> "Juliano Cezar", "The Grenadines", "Gangway", "…
## $ ID <chr> "4mGnpjhqgx4RUdsIJiURdo", "1dLnVku4VQUOLswwDFvR…
## $ Genre <chr> "other", "rock", "pop", "pop", "other", "electr…
## $ acousticness <dbl> 0.2651667, 0.0331000, 0.3940000, 0.0122000, 0.9…
## $ danceability <dbl> 0.4486667, 0.5150000, 0.7010000, 0.3480000, 0.3…
## $ energy <dbl> 0.6696667, 0.3730000, 0.6260000, 0.5400000, 0.1…
## $ loudness <dbl> -4.549000, -9.872000, -11.246000, -8.051000, -1…
## $ liveness <dbl> 0.2873333, 0.0822000, 0.0813000, 0.1110000, 0.1…
## $ speechiness <dbl> 0.0345000, 0.0241000, 0.0268000, 0.0415000, 0.0…
## $ tempo <dbl> 100.22567, 95.00500, 97.98800, 103.56500, 151.9…
## $ valence <dbl> 0.3743333, 0.2460000, 0.9350000, 0.1620000, 0.5…
# Per-column summary of the cleaned artist/genre table (quartiles and mean for numeric columns)
summary(artist_genres_clean)
## Artist ID Genre
## Length:34680 Length:34680 Length:34680
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## acousticness danceability energy loudness
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :-57.436
## 1st Qu.:0.0462 1st Qu.:0.4634 1st Qu.:0.4450 1st Qu.:-10.815
## Median :0.2298 Median :0.5990 Median :0.6450 Median : -7.616
## Mean :0.3415 Mean :0.5744 Mean :0.5987 Mean : -9.453
## 3rd Qu.:0.5930 3rd Qu.:0.7090 3rd Qu.:0.7980 3rd Qu.: -5.652
## Max. :0.9960 Max. :0.9840 Max. :1.0000 Max. : 1.605
## liveness speechiness tempo valence
## Min. :0.0000 Min. :0.00000 Min. : 0.0 Min. :0.0000
## 1st Qu.:0.1007 1st Qu.:0.03900 1st Qu.:101.6 1st Qu.:0.2780
## Median :0.1340 Median :0.05350 Median :119.6 Median :0.4749
## Mean :0.1864 Mean :0.09443 Mean :119.4 Mean :0.4751
## 3rd Qu.:0.2232 3rd Qu.:0.09870 3rd Qu.:133.9 3rd Qu.:0.6690
## Max. :0.9990 Max. :0.96300 Max. :244.0 Max. :0.9920
The density ridges for each audio feature help visualize what each genre sounds like.
# Ridgeline density plots: one ridge per genre for each audio feature

# acousticness
ggplot(data = artist_genres_clean, mapping = aes(x = acousticness, y = Genre)) +
  geom_density_ridges() +
  theme_minimal()

# danceability
ggplot(data = artist_genres_clean, mapping = aes(x = danceability, y = Genre)) +
  geom_density_ridges() +
  theme_minimal()

# energy
ggplot(data = artist_genres_clean, mapping = aes(x = energy, y = Genre)) +
  geom_density_ridges() +
  theme_minimal()

# loudness
ggplot(data = artist_genres_clean, mapping = aes(x = loudness, y = Genre)) +
  geom_density_ridges() +
  theme_minimal()

# liveness
ggplot(data = artist_genres_clean, mapping = aes(x = liveness, y = Genre)) +
  geom_density_ridges() +
  theme_minimal()

# speechiness
ggplot(data = artist_genres_clean, mapping = aes(x = speechiness, y = Genre)) +
  geom_density_ridges() +
  theme_minimal()

# tempo
ggplot(data = artist_genres_clean, mapping = aes(x = tempo, y = Genre)) +
  geom_density_ridges() +
  theme_minimal()

# valence
ggplot(data = artist_genres_clean, mapping = aes(x = valence, y = Genre)) +
  geom_density_ridges() +
  theme_minimal()